Setup

library(SeqArray)
library(SNPRelate)
library(pander)
library(scales)
library(magrittr)
library(tidyverse)
library(readxl)
library(sp)
library(ggmap)
library(rgdal)
library(ggsn)
library(parallel)
library(qqman)
library(ggrepel)
library(plyranges)
library(rtracklayer)
theme_set(theme_bw())
panderOptions("missing", "")
mc <- min(12, detectCores() - 1)
alpha <- 0.05
w <- 169/25.4
h <- 169/25.4

This workflow immediately follows 03_SNPFiltering and performs an analysis on the SNPs retained after filtering. Analysis performed is:

gdsPath <- file.path("..", "5_stacks", "gds", "populations.snps.gds")
gdsFile <- seqOpen(gdsPath, readonly = TRUE)
keepSNPs <- readRDS("keepSNPsAfterLDPruning.RDS")
sampleID <- tibble(
    Sample = seqGetData(gdsFile, "sample.annotation/Sample"),
    Population = seqGetData(gdsFile, "sample.annotation/Population"),
    Location = seqGetData(gdsFile, "sample.annotation/Location")
)
popSizes <- sampleID %>% 
    group_by(Population) %>%
    summarise(n = dplyr::n())
snps <- tibble(
    variant.id = seqGetData(gdsFile, "variant.id") ,
    chromosome = seqGetData(gdsFile, "chromosome"),
    position = seqGetData(gdsFile, "position"),
    snpID = seqGetData(gdsFile, "annotation/id")
) %>%
    mutate(snpID = str_remove(snpID, ":.$"),
           snpID = str_replace(snpID, ":", "_")) %>%
    separate(snpID, into = c("Locus ID", "Col"), remove = FALSE) %>%
    mutate_at(vars(`Locus ID`, Col), as.integer) %>%
    mutate(Col = Col - 1,
           snpID = paste(`Locus ID`, Col, sep = "_")) %>%
    right_join(keepSNPs)
seqSetFilter(gdsFile, variant.id = snps$variant.id)
## # of selected variants: 18,886
genotypes <- snps %>%
    cbind(seqGetData(gdsFile, "genotype") %>% 
              apply(MARGIN = 3, colSums) %>%
              t %>%
              set_colnames(sampleID$Sample)) %>%
    as_tibble() %>%
    gather(Sample, Genotype, one_of(sampleID$Sample)) %>%
    dplyr::filter(!is.na(Genotype)) %>%
    arrange(variant.id, Sample) %>%
    left_join(sampleID)
seqResetFilter(gdsFile)
## # of selected samples: 146
## # of selected variants: 146,763
snpIn1996 <- genotypes %>% 
    filter(Population == 1996) %>% 
    group_by(variant.id) %>% 
    summarise(maf = mean(Genotype) / 2) %>%
    filter(maf > 0)
genotypes %<>% 
    right_join(snpIn1996)
seqClose(gdsFile)

In addition to the above, the set of genes corresponding the Ensembl Release 84 were loaded.

ensGenes <- file.path("..", "external", "Oryctolagus_cuniculus.OryCun2.0.84.gff3.gz") %>%
    import.gff3(feature.type = "gene", sequenceRegionsAsSeqinfo = TRUE) %>%
    .[,c("gene_id", "Name", "description")]

The set of SNPs under invetigation was also defined as a GRanges object.

snpsGR <- makeGRangesFromDataFrame(
    df = snps, 
    keep.extra.columns = TRUE, 
    ignore.strand = TRUE, 
    seqinfo = seqinfo(ensGenes), 
    seqnames.field = "chromosome", 
    start.field = "position", 
    end.field = "position"
)

This was intersected with the set of genes to connect SNPs to genes within 40kb.

snp2Gene <- snpsGR %>% 
    resize(width = 199999) %>%
    trim() %>%
    join_overlap_inner(ensGenes) 

PCA

lowCall <- c("gc2901", "gc2776", "gc2731", "gc2727", "gc2686")
snp4PCA <- genotypes %>% 
    filter(!Sample %in% lowCall) %>%
    group_by(variant.id, Population) %>% 
    summarise(n = dplyr::n()) %>% 
    spread(Population, n) %>%
    mutate(N = `1996` + `2010` + `2012`) %>% 
    ungroup() %>%
    filter(N > 0.95*(sum(popSizes$n) - length(lowCall)))
pca <- genotypes %>%
    filter(variant.id %in% snp4PCA$variant.id) %>%
    dplyr::select(variant.id, Sample, Genotype) %>%
    spread(Sample, Genotype) %>%
    as.data.frame() %>%
    column_to_rownames("variant.id") %>%
    as.matrix() %>%
    apply(2, function(x){
        x[is.na(x)] <- mean(x, na.rm = TRUE)
        x
    }) %>%
    t() %>%
    .[, apply(., 2, function(x){length(unique(x)) > 1})] %>%
    prcomp( center = TRUE)

As noted in the previous section sample samples gc2901, gc2776, gc2731, gc2727 and gc2686 had a SNP identification rate \(< 50\)% and as such these were markd as potential outliers. Ignoring these samples, and restricting data to SNPs identified in \(>95\)% of all samples, a preliminary PCA was performed This amounted to 7,767 of the possible 18,878 SNPs for analysis using PCA. Missing values were specified as the mean MAF over all populations combined.

Given the initially observed structure, in which samples from the 2012 population are separating from the other samples which group with the 1996 population, the collection points for the 2012 samples as investigated.

sampleID %<>%
    left_join(
        file.path("..", "external", "GPS_Locations.xlsx") %>%
            read_excel() %>%
            dplyr::select(Sample, ends_with("tude")) %>%
            mutate(Sample = gsub("[Oo][Rr][Aa] ([0-9ABC]*)", "ora\\1", Sample))
    )
*PCA showing population structure. Point size reflects the proportion of SNPs for which imputation was required, and the observed structure appeared independent of this.*

PCA showing population structure. Point size reflects the proportion of SNPs for which imputation was required, and the observed structure appeared independent of this.

loc <- c(range(sampleID$Longitude, na.rm = TRUE) %>% mean,
         range(sampleID$Latitude, na.rm = TRUE) %>% mean)
saPoly <- readRDS(file.path("..", "external", "saPoly.RDS"))
roads <- readRDS(file.path("..", "external", "roads.RDS"))
gc <- SpatialPoints(cbind(x = 138.655972, y = -31.200305))
proj4string(gc) <- "+proj=longlat +ellps=GRS80 +no_defs"
xBreaks <- seq(138.65, 138.8, by = 0.05)
xLabs <- parse(text = paste(xBreaks, "*degree ~ E", sep = ""))
yBreaks <- seq(-31.2, -31.32, by = -0.04)
yLabs <- parse(text = paste(-yBreaks, "*degree ~ S", sep = ""))
leftN <- tibble(x = c(138.7965, 138.8, 138.8) - 0.01,
                    y = c(-31.2, -31.198, -31.193))
rightN <- tibble(x = c(138.8, 138.8, 138.8035) - 0.01,
                     y = c(-31.193, -31.198, -31.2))
ggMap <- get_map(loc, zoom = 12, maptype = "terrain", color = "bw")
zoomPlot <- ggmap(ggMap, extent = "normal", maprange = FALSE) +
    geom_path(
        aes(long, lat, group = group), 
        data = subset(roads, SURFACE == "UNSE"), 
        linetype = 2, size = 0.3) + 
    geom_path(
        aes(long, lat, group = group), 
        data = subset(roads, SURFACE != "UNSE"), 
        linetype = 1, size = 0.4) +
    geom_label(x = 138.74, y = -31.29, label = "Flinders Ranges NP", alpha = 0.4) +
    geom_label(x = 138.72, y = -31.22, label = "Gum Creek", alpha = 0.4) +
    geom_point(aes(x, y), data = as.data.frame(gc), shape = 3, size = 3) +
    geom_text(aes(x, y), label = "HS", data = as.data.frame(gc), nudge_y = 0.005) +
    geom_polygon(
        data = subset(saPoly, F_CODE == "HD"),
        aes(long, lat, group = group),
        fill = rgb(1, 1, 1, 0), colour = "grey10", size = 0.3) +
    geom_polygon(
        data = subset(saPoly, F_CODE == "PARK"),
        aes(long, lat, group = group),
        fill = rgb(1, 1, 1, 0), colour = "grey10", size = 0.2) +
    geom_point(
        data= filter(pca4Plot, grepl("2012", Population)),
        aes(Longitude, Latitude, colour = Population),
        size = 0.9*ps) +
    geom_polygon(aes(x, y), data = leftN, fill = "white", colour = "grey10", size = 0.4) +
    geom_polygon(aes(x, y), data = rightN, fill = "grey10", colour = "grey10", size = 0.4) +
    geom_text(x = 138.79, y = -31.19, label = "N", 
              colour = "grey10", size = 4) +
    scale_colour_manual(values = popCols[2:3]) +
    coord_cartesian(xlim = c(138.618, 138.81),
                    ylim = c(-31.335, -31.18),
                    expand = 0) +
    scale_x_continuous(breaks = xBreaks, labels = xLabs) +
    scale_y_continuous(breaks = yBreaks, labels = yLabs) +
    guides(colour = FALSE) +
    ggsn::scalebar(x.min = 138.618, x.max = 138.81,
                   y.min = -31.335, y.max = -31.18,
                   transform = TRUE,
                   dist = 2, dist_unit = "km",
                   model = 'GRS80',
                   height = 0.012, st.size = 4,
                   location = 'bottomright',
                   anchor = c(x = 138.8, y = -31.328)) +
    labs(x = "Longitude",
         y = "Latitude") +
    theme(text = element_text(size = fs),
          plot.margin = unit(c(1, 1, 1, 1), "mm"))
ausPolygon <- readRDS(file.path("..", "external", "ausPolygon.RDS"))
ausPts <- SpatialPoints(cbind(x = loc[1], y = loc[2]))
proj4string(ausPts) <- proj4string(ausPolygon)
ausPlot <- ggplot() +
  geom_polygon(data = ausPolygon, 
               aes(long, lat, group = group), fill = "white", colour = "black") + 
  geom_point(data = as.data.frame(ausPts), aes(x, y), size = 1.5) +
  theme_void() +
  theme(plot.background = element_rect(fill = "white", colour = "black"))
## png 
##   2
*Figure 1: Collection points for all 2012 samples with colours showing sub-populations initially defined by PCA analysis and *k*-means clustering.*

Figure 1: Collection points for all 2012 samples with colours showing sub-populations initially defined by PCA analysis and k-means clustering.

zoomLoc <- c(138.753, -31.242)
xBreaks <- seq(138.74, 138.76, by = 0.01)
xLabs <- parse(text = paste(xBreaks, "*degree ~ W", sep = ""))
yBreaks <- seq(-31.235, -31.25, by = -0.005)
yLabs <- parse(text = paste(-yBreaks, "*degree ~ S", sep = ""))
central <- rbind(x = c(138.749, 138.755),
                 y = c(-31.2365, -31.2495)) %>%
  set_colnames(c("min", "max"))
leftN <- tibble(x = c(138.7595, 138.76, 138.76),
                    y = c(-31.234, -31.2335, -31.2325))
rightN <- tibble(x = c(138.76, 138.76, 138.7605),
                     y = c(-31.2325, -31.2335, -31.234))
get_map(zoomLoc, zoom = 15, maptype = "terrain", source = "google", color = "bw") %>%
    ggmap() +
    geom_jitter(
        data = filter(pca4Plot, grepl("2012", Population)), 
        aes(Longitude, Latitude, colour = Population), 
        size = 3, width = 0.0005, height = 0) +
    geom_rect(
        xmin = central["x", "min"],
        xmax = central["x", "max"],
        ymin = central["y", "min"],
        ymax = central["y", "max"],
        fill = "red", alpha = 0.01, colour = "black") +
    geom_polygon(aes(x, y), data = leftN, fill = "white", colour = "grey10", size = 0.4) +
    geom_polygon(aes(x, y), data = rightN, fill = "grey10", colour = "grey10", size = 0.4) +
    geom_text(x = 138.76, y = -31.232, label = "N", 
              colour = "grey10", size = 5) +
    scale_colour_manual(values = popCols[2:3]) +
    scale_x_continuous(breaks = xBreaks, labels = xLabs) +
    scale_y_continuous(breaks = yBreaks, labels = yLabs) +
    theme_bw() +
    guides(colour = FALSE) +
    labs(x = "Longitude",
         y = "Latitude") +
    coord_cartesian(xlim = c(138.74, 138.762),
                    ylim = c(-31.253, -31.231),
                    expand = 0) +
    ggsn::scalebar(
        x.min = 138.74, x.max = 138.762, 
        y.min = -31.253, y.max = -31.231,
        transform = TRUE,
        dist = 0.25, dist_unit = "km",, model = 'GRS80', 
        height = 0.012, st.size = 4,
        location = 'bottomright',
        anchor = c(x = 138.761, y = -31.252)
    )
*Zoomed-in view of the central region for 2012 samples with colours showing sub-populations defined by PCA analysis. The region considered to be the Central Region is shaded in red. Due to overlapping GPS points a small amount of jitter has been added to the x-axis.*

Zoomed-in view of the central region for 2012 samples with colours showing sub-populations defined by PCA analysis. The region considered to be the Central Region is shaded in red. Due to overlapping GPS points a small amount of jitter has been added to the x-axis.

Region Analysis

Removal of SNPs Associated with Collection Region

The structure observed within the 2012 population in the PCA could possibly be explained by recent migration into this region. As the samples collected in the outer regions appeared very similar to the 1996 population in the above plots, this would possibly indicate migration a very recent event as the genetic influence of this has not spread through the wider area. Although this may be due to other factors such as sampling bias, this structure was addressed by identifying SNPs which showed an association with the sub-populations identified by PCA analysis. In this way, any candidate SNPs obtained below will be less impacted by this structure, and will be more reflective of the intended variable under study, i.e. selection over time, as opposed to any internal structure of the 2012 population.

oraRegions <- pca4Plot %>% 
  filter(grepl("2012", Population)) %>% 
  rowwise() %>%
  mutate(yCentral = cut(Latitude, breaks = central["y",], include.lowest = TRUE), 
         xCentral = cut(Longitude, breaks = central["x",], include.lowest = TRUE),
         Central = (is.na(yCentral) + is.na(xCentral)) == 0) %>%
  dplyr::select(Sample, Central) 

Testing for Structure in 2012

This model tests:
H0: No association between genotypes and collection region
HA: An association exists between genotypes and collection region

regionResults <- genotypes %>%
    filter(Population == 2012) %>%
    split(f = .$variant.id) %>%
    mclapply(function(x){
        ft <- list(p.value = NA)
        if (length(unique(x$Genotype)) > 1) {
            ft <- x %>%
                left_join(oraRegions) %>%
                group_by(Genotype, Central) %>%
                tally() %>%
                spread(Genotype, n, fill = 0) %>%
                column_to_rownames("Central") %>%
                fisher.test()
        }
        x %>%
            distinct(variant.id, snpID, chromosome, position) %>%
            mutate(p = ft$p.value)
    }, mc.cores = mc) %>%
    bind_rows() %>%
    filter(!is.na(p)) %>%
    arrange(p)

A total of 1526 SNPs were detected as showing a significant association between genotype and the collection region. Under H0, the number expected using α = 0.05 would be 943, and as this number was approximately double that expected, this was taken as evidence of this being a genuine point of concern for this dataset.

Notably, Type II errors were of principle concern in this instance, and as such every SNP with p < 0.05 in the above test was excluded from downstream analysis.

regionSNPs <- filter(regionResults, p < 0.05)
saveRDS(regionSNPs, "regionSNPs.RDS")

Under this additional filtering step, the original set of 17076 SNPs will be reduced to 15,550 for testing by genotype and allele frequency.

Verification Of Removal

In order to verify that the removal of the above SNPs removed the undesired population structure from the 2012 population, the above PCA was repeated, excluding the SNPs marked for removal. The previous structure noted in the data was no longer evident, and as such, these SNPs were marked for removal during analysis by genotype and allele frequency.

pcaPost <- genotypes %>%
    filter(variant.id %in% snp4PCA$variant.id,
           !variant.id %in% regionSNPs$variant.id) %>%
    dplyr::select(variant.id, Sample, Genotype) %>%
    spread(Sample, Genotype) %>%
    as.data.frame() %>%
    column_to_rownames("variant.id") %>%
    as.matrix() %>%
    apply(2, function(x){
        x[is.na(x)] <- mean(x, na.rm = TRUE)
        x
    }) %>%
    t() %>%
    .[, apply(., 2, function(x){length(unique(x)) > 1})] %>%
    prcomp( center = TRUE) 
pcaPost4Plot <- pcaPost$x %>%
    as.data.frame() %>%
    rownames_to_column("Sample") %>%
    as_tibble() %>%
    dplyr::select(Sample, PC1, PC2, PC3) %>%
    left_join(sampleID) %>%
    mutate(Cluster = kmeans(cbind(PC1, PC2, PC3), 3)$cluster) %>%
    group_by(Cluster) %>%
    mutate(maxY = max(Latitude, na.rm = TRUE)) %>%
    ungroup() %>%
    mutate(Population = case_when(
        Population == 1996 ~ "1996",
        Population == 2010 ~ "Outgroup (Turretfield)",
        maxY == max(maxY) ~ "2012 (Outer)",
        maxY != max(maxY) ~ "2012 (Central)"
    )) %>%
    left_join(genotypes %>% 
                  filter(variant.id %in% snp4PCA$variant.id,
                         !Sample %in% lowCall) %>% 
                  group_by(Sample) %>% 
                  tally() %>% 
                  mutate(imputationRate = 1 - n / nrow(snp4PCA))) 
*Figure 2: Principal Components Analysis showing structures before removal of SNPs denoting collection region in the 2012 population, and after removal of these SNPs*

Figure 2: Principal Components Analysis showing structures before removal of SNPs denoting collection region in the 2012 population, and after removal of these SNPs

SNP Analysis

Genotype Frequency Model

This model tests:
H0: No association between genotypes and populations
HA: An association exists between genotypes and populations

genotypeResults <- genotypes %>%
    filter(Population != 2010,
           !variant.id %in% regionSNPs$variant.id) %>%
    group_by(variant.id, snpID, Population, Genotype) %>%
    tally() %>%
    ungroup() %>%
    split(f = .$variant.id) %>%
    mclapply(function(x){
        ft <- list(p.value = NA)
        if (length(unique(x$Genotype)) > 1) {
            ft <- x %>%
                spread(Genotype, n, fill = 0) %>%
                column_to_rownames("Population") %>%
                dplyr::select(-variant.id, -snpID) %>%
                fisher.test()
        }
        x %>% 
            distinct(variant.id) %>%
            mutate(p = ft$p.value)
    },mc.cores = mc) %>%
    bind_rows() %>%
    filter(!is.na(p)) %>%
    mutate(FDR = p.adjust(p, "fdr"),
           adjP = p.adjust(p, "bonferroni")) %>%
    arrange(p) %>%
    left_join(genotypes %>%
                  distinct(variant.id, snpID, chromosome, position)) %>%
    dplyr::select(variant.id, snpID, chromosome, position, everything())

Under the full genotype model:

  • 15 genotypes were detected as being significantly associated with the two populations when controlling the FWER at α = 0.05
  • 38 genotypes were detected as being significantly associated with the two populations when controlling the FDR at α = 0.05
  • For the most highly ranked SNP (3391201_92), the minor allele has been completely lost in the 2012 population
SNPs with adjusted p-values < 0.05 when analysing by genotype.
snpID chromosome position p FDR adjP Gene within 100kb
3391201_92 GL018705 1,937,359 6.103e-08 0.0009118 0.001057
686773_29 3 90,851,512 1.053e-07 0.0009118 0.001824 CRISPLD1
686773_29 3 90,851,512 1.053e-07 0.0009118 0.001824 SGF29
916731_91 4 89,602,973 1.865e-07 0.0009927 0.00323 RIC8B
916731_91 4 89,602,973 1.865e-07 0.0009927 0.00323 RFX4
836950_151 4 35,111,855 2.587e-07 0.0009927 0.00448 TMPRSS12
836950_151 4 35,111,855 2.587e-07 0.0009927 0.00448 METTL7A
836950_151 4 35,111,855 2.587e-07 0.0009927 0.00448 SLC11A2
321186_117 2 2.8e+07 2.866e-07 0.0009927 0.004963 KLF3
2227006_109 13 129,650,867 4.419e-07 0.001061 0.007652 ID3
2227006_109 13 129,650,867 4.419e-07 0.001061 0.007652 E2F2
2227006_109 13 129,650,867 4.419e-07 0.001061 0.007652 ASAP3
2227006_109 13 129,650,867 4.419e-07 0.001061 0.007652 TCEA3
2227006_109 13 129,650,867 4.419e-07 0.001061 0.007652 ZNF436
3040789_114 19 30,689,426 4.838e-07 0.001061 0.008378 MSI2
4165193_106 GL019119 147,630 4.901e-07 0.001061 0.008487
4223149_99 GL019332 5,745 6.122e-07 0.001178 0.0106
1089925_112 7 35,584,375 9.805e-07 0.001597 0.01698
464226_128 2 125,603,458 1.014e-06 0.001597 0.01757 BCL11A
3374849_109 GL018704 1,294,009 1.387e-06 0.002002 0.02403
3695415_108 GL018751 737,318 1.837e-06 0.002417 0.03181 TRAF3
3695415_108 GL018751 737,318 1.837e-06 0.002417 0.03181 RCOR1
2099634_108 13 45,086,624 2.028e-06 0.002417 0.03512 WARS2
2099634_108 13 45,086,624 2.028e-06 0.002417 0.03512 TBX15
1902686_98 12 60,113,885 2.094e-06 0.002417 0.03626
1902686_98 12 60,113,885 2.094e-06 0.002417 0.03626
1902686_98 12 60,113,885 2.094e-06 0.002417 0.03626
1902686_98 12 60,113,885 2.094e-06 0.002417 0.03626 DPPA5
1902686_98 12 60,113,885 2.094e-06 0.002417 0.03626 KHDC3L
1902686_98 12 60,113,885 2.094e-06 0.002417 0.03626
1902686_98 12 60,113,885 2.094e-06 0.002417 0.03626
1902686_98 12 60,113,885 2.094e-06 0.002417 0.03626 MB21D1
Figure 3a: Manhattan plot showing results for all SNPs on chromosomes 1:21 for analysis by genotype. The horizontal line indicates the cutoff for an FDR of 5%, with SNPs with an adjusted p-value < 0,05 shown in green

Figure 3a: Manhattan plot showing results for all SNPs on chromosomes 1:21 for analysis by genotype. The horizontal line indicates the cutoff for an FDR of 5%, with SNPs with an adjusted p-value < 0,05 shown in green

*Variants detected as significant to an FDR of 5%. Those denoted with an asterisk received a Bonferroni-adjusted p-value < 0.05 and these typically involved reduction of the minor allele frequency and a shift towards homozygous reference. The remaining sites showed a combination of the same and increasing minor allele abundance, with some variants becoming exclusively heterozygous in the 2012 population.*

Variants detected as significant to an FDR of 5%. Those denoted with an asterisk received a Bonferroni-adjusted p-value < 0.05 and these typically involved reduction of the minor allele frequency and a shift towards homozygous reference. The remaining sites showed a combination of the same and increasing minor allele abundance, with some variants becoming exclusively heterozygous in the 2012 population.

Allele Frequency Model

This model tests:
H0: No association between allele frequencies and populations
HA: An association exists between allele frequencies and populations

alleleResults <- genotypes %>%
    filter(Population != 2010,
           !variant.id %in% regionSNPs$variant.id) %>%
    group_by(snpID, Population) %>%
    summarise(P = sum(2 - Genotype),
              Q = sum(Genotype)) %>%
    ungroup() %>%
    split(f = .$snpID) %>%
    mclapply(function(x){
        m <- as.matrix(x[c("P", "Q")])
        ft <- list(p.value = NA) 
        if (length(m) == 4) {
            ft <- fisher.test(m)
        }
        x %>%
            mutate(MAF = Q / (P + Q)) %>%
            dplyr::select(snpID, Population, MAF) %>%
            spread(Population, MAF) %>%
            mutate(p = ft$p.value)
    },mc.cores = mc) %>%
    bind_rows() %>%
    filter(!is.na(p)) %>%
    dplyr::rename(MAF_1996 = `1996`,
                  MAF_2012 = `2012`) %>%
    mutate(FDR = p.adjust(p, "fdr"),
           adjP = p.adjust(p, "bonferroni")) %>%
    arrange(p) %>%
    left_join(genotypes %>%
                  distinct(snpID, chromosome, position)) %>%
    dplyr::select(snpID, chromosome, position, everything())

Under this model:

  • 5 SNP alleles were detected as being significantly associated with the two populations when controlling the FWER at α = 0.05. However, as these SNPs were within 21nt of each other, this may represent the same haplotype
  • 38 SNP alleles were detected as being significantly associated with the two populations when controlling the FDR at α = 0.05
SNPs considered as significant when analysing by genotype using an FDR cutoff of 0.05
snpID chromosome position MAF_1996 MAF_2012 p FDR adjP Gene within 100kb
3391201_92 GL018705 1,937,359 0.2255 0.009615 3.44e-07 0.005965 0.00597
916731_91 4 89,602,973 0.1961 0 6.875e-07 0.005965 0.01193 RIC8B
916731_91 4 89,602,973 0.1961 0 6.875e-07 0.005965 0.01193 RFX4
3040789_114 19 30,689,426 0.1863 0 1.531e-06 0.007697 0.02656 MSI2
321186_117 2 2.8e+07 0.25 0.0122 1.866e-06 0.007697 0.03238 KLF3
1009106_1 6 9,037,073 0.4432 0.1277 2.52e-06 0.007697 0.04372 SMG1
1009106_1 6 9,037,073 0.4432 0.1277 2.52e-06 0.007697 0.04372 ARL6IP1
836950_151 4 35,111,855 0.2907 0.04082 3.086e-06 0.007697 0.05355 TMPRSS12
836950_151 4 35,111,855 0.2907 0.04082 3.086e-06 0.007697 0.05355 METTL7A
836950_151 4 35,111,855 0.2907 0.04082 3.086e-06 0.007697 0.05355 SLC11A2
464226_128 2 125,603,458 0.1915 0 3.105e-06 0.007697 0.05388 BCL11A
2227006_109 13 129,650,867 0.2708 0.03333 4.172e-06 0.008442 0.07239 ID3
2227006_109 13 129,650,867 0.2708 0.03333 4.172e-06 0.008442 0.07239 E2F2
2227006_109 13 129,650,867 0.2708 0.03333 4.172e-06 0.008442 0.07239 ASAP3
2227006_109 13 129,650,867 0.2708 0.03333 4.172e-06 0.008442 0.07239 TCEA3
2227006_109 13 129,650,867 0.2708 0.03333 4.172e-06 0.008442 0.07239 ZNF436
1902686_98 12 60,113,885 0.1633 0 4.749e-06 0.008442 0.0824
1902686_98 12 60,113,885 0.1633 0 4.749e-06 0.008442 0.0824
1902686_98 12 60,113,885 0.1633 0 4.749e-06 0.008442 0.0824
1902686_98 12 60,113,885 0.1633 0 4.749e-06 0.008442 0.0824 DPPA5
1902686_98 12 60,113,885 0.1633 0 4.749e-06 0.008442 0.0824 KHDC3L
1902686_98 12 60,113,885 0.1633 0 4.749e-06 0.008442 0.0824
1902686_98 12 60,113,885 0.1633 0 4.749e-06 0.008442 0.0824
1902686_98 12 60,113,885 0.1633 0 4.749e-06 0.008442 0.0824 MB21D1
3374849_109 GL018704 1,294,009 0.2065 0.0102 4.865e-06 0.008442 0.08442
4165193_106 GL019119 147,630 0.2736 0.04167 5.576e-06 0.008796 0.09676
4044777_119 GL018933 204,058 0.01754 0.2019 7.749e-06 0.01121 0.1345 FNBP1
4044777_119 GL018933 204,058 0.01754 0.2019 7.749e-06 0.01121 0.1345 USP20
4044777_119 GL018933 204,058 0.01754 0.2019 7.749e-06 0.01121 0.1345 C9orf78
4044777_119 GL018933 204,058 0.01754 0.2019 7.749e-06 0.01121 0.1345 TOR1A
4044777_119 GL018933 204,058 0.01754 0.2019 7.749e-06 0.01121 0.1345 TOR1B
4044777_119 GL018933 204,058 0.01754 0.2019 7.749e-06 0.01121 0.1345 PTGES
4044777_119 GL018933 204,058 0.01754 0.2019 7.749e-06 0.01121 0.1345 PRRX2
1089925_112 7 35,584,375 0.2841 0.04348 9.725e-06 0.01298 0.1687
3695415_108 GL018751 737,318 0.2609 0.03261 1.275e-05 0.0146 0.2213 TRAF3
3695415_108 GL018751 737,318 0.2609 0.03261 1.275e-05 0.0146 0.2213 RCOR1
4223149_99 GL019332 5,745 0.3077 0.05814 1.314e-05 0.0146 0.2281
2099634_108 13 45,086,624 0.2609 0.03409 1.412e-05 0.0146 0.2451 WARS2
2099634_108 13 45,086,624 0.2609 0.03409 1.412e-05 0.0146 0.2451 TBX15
87917_119 1 64,635,286 0.186 0 1.43e-05 0.0146 0.2481 CEP78
87917_119 1 64,635,286 0.186 0 1.43e-05 0.0146 0.2481 PSAT1
4010189_98 GL018907 283,766 0.1932 0.01 1.627e-05 0.01508 0.2823 BRWD1
4010189_98 GL018907 283,766 0.1932 0.01 1.627e-05 0.01508 0.2823
4010189_98 GL018907 283,766 0.1932 0.01 1.627e-05 0.01508 0.2823 LCA5L
4010189_98 GL018907 283,766 0.1932 0.01 1.627e-05 0.01508 0.2823 SH3BGR
2689075_49 16 56,527,308 0.05814 0.3043 1.652e-05 0.01508 0.2866 ESRRG
4149102_107 GL019077 34,265 0.2232 0.02941 2.279e-05 0.01977 0.3954 IFT140
4149102_107 GL019077 34,265 0.2232 0.02941 2.279e-05 0.01977 0.3954 TMEM204
4149102_107 GL019077 34,265 0.2232 0.02941 2.279e-05 0.01977 0.3954 CRAMP1
4149102_107 GL019077 34,265 0.2232 0.02941 2.279e-05 0.01977 0.3954
1997286_93 12 141,844,290 0.08889 0.3556 2.435e-05 0.02005 0.4225 ESR1
2906428_79 18 25,311,176 0.1731 0.4434 2.542e-05 0.02005 0.4411 CDK1
2405359_66 14 97,813,263 0.02083 0.2021 4.48e-05 0.0338 0.7774 STXBP5L
3000704_65 19 13,055,154 0.03191 0.234 5.416e-05 0.0381 0.9398
4146664_90 GL019084 74,038 0.1961 0.02041 5.49e-05 0.0381 0.9526 GNB1
4146664_90 GL019084 74,038 0.1961 0.02041 5.49e-05 0.0381 0.9526 NADK
4146664_90 GL019084 74,038 0.1961 0.02041 5.49e-05 0.0381 0.9526
4098854_101 GL018985 129,495 0.1702 0.01042 6.686e-05 0.04298 1
4098854_101 GL018985 129,495 0.1702 0.01042 6.686e-05 0.04298 1 UFD1L
4098854_101 GL018985 129,495 0.1702 0.01042 6.686e-05 0.04298 1 C22orf39
4098854_101 GL018985 129,495 0.1702 0.01042 6.686e-05 0.04298 1 MRPL40
4098854_101 GL018985 129,495 0.1702 0.01042 6.686e-05 0.04298 1 HIRA
4098838_14 GL018985 123,328 0.3636 0.1275 6.797e-05 0.04298 1
4098838_14 GL018985 123,328 0.3636 0.1275 6.797e-05 0.04298 1 UFD1L
4098838_14 GL018985 123,328 0.3636 0.1275 6.797e-05 0.04298 1 C22orf39
4098838_14 GL018985 123,328 0.3636 0.1275 6.797e-05 0.04298 1 MRPL40
4098838_14 GL018985 123,328 0.3636 0.1275 6.797e-05 0.04298 1 HIRA
1882774_17 12 40,509,779 0.3214 0.5943 7.288e-05 0.04298 1
1045185_75 7 2,702,919 0.2143 0.03774 7.807e-05 0.04298 1
1045185_75 7 2,702,919 0.2143 0.03774 7.807e-05 0.04298 1 ZNF212
1045185_75 7 2,702,919 0.2143 0.03774 7.807e-05 0.04298 1 ZNF282
1045185_75 7 2,702,919 0.2143 0.03774 7.807e-05 0.04298 1 ZNF398
3016258_99 19 19,178,148 0.22 0.03261 7.861e-05 0.04298 1 SEZ6
3016258_99 19 19,178,148 0.22 0.03261 7.861e-05 0.04298 1 PHF12
3016258_99 19 19,178,148 0.22 0.03261 7.861e-05 0.04298 1 DHRS13
3016258_99 19 19,178,148 0.22 0.03261 7.861e-05 0.04298 1 FLOT2
3016258_99 19 19,178,148 0.22 0.03261 7.861e-05 0.04298 1 ERAL1
3016258_99 19 19,178,148 0.22 0.03261 7.861e-05 0.04298 1 FAM222B
2028473_73 13 2,979,383 0.01818 0.1731 8.051e-05 0.04298 1 RFWD2
3829050_109 GL018791 957,960 0.163 0.009615 8.094e-05 0.04298 1 FHAD1
3829050_109 GL018791 957,960 0.163 0.009615 8.094e-05 0.04298 1 TMEM51
2580149_42 15 87,473,844 0.3511 0.1154 8.416e-05 0.04298 1 ADGRL3
2527760_104 15 43,156,117 0.1531 0.4057 8.594e-05 0.04298 1
2527760_104 15 43,156,117 0.1531 0.4057 8.594e-05 0.04298 1 AIMP1
2527760_104 15 43,156,117 0.1531 0.4057 8.594e-05 0.04298 1 TBCK
3999128_75 GL018883 283,912 0.3654 0.125 8.683e-05 0.04298 1 SERPINA6
3999128_75 GL018883 283,912 0.3654 0.125 8.683e-05 0.04298 1
3999128_75 GL018883 283,912 0.3654 0.125 8.683e-05 0.04298 1
3999128_75 GL018883 283,912 0.3654 0.125 8.683e-05 0.04298 1
3999128_75 GL018883 283,912 0.3654 0.125 8.683e-05 0.04298 1
3999128_75 GL018883 283,912 0.3654 0.125 8.683e-05 0.04298 1
3999128_75 GL018883 283,912 0.3654 0.125 8.683e-05 0.04298 1 SERPINA11
3999128_75 GL018883 283,912 0.3654 0.125 8.683e-05 0.04298 1 SERPINA9
3999128_75 GL018883 283,912 0.3654 0.125 8.683e-05 0.04298 1 SERPINA12
4176850_23 GL019154 883 0.234 0.04082 8.916e-05 0.04298 1 MTHFSD
4176850_23 GL019154 883 0.234 0.04082 8.916e-05 0.04298 1 FOXF1
686773_29 3 90,851,512 0.4375 0.1667 9.731e-05 0.04563 1 CRISPLD1
686773_29 3 90,851,512 0.4375 0.1667 9.731e-05 0.04563 1 SGF29
3032107_92 19 26,020,372 0.1939 0.02083 0.0001019 0.04654 1
## png 
##   2
Manhattan plot showing results for all SNPs on chromosomes 1:21 when analysing by allele frequencies. The horizontal line indicates the cutoff for an FDR of 5%, with SNPs considered significant under the Bonferroni adjustment shown in green.

Manhattan plot showing results for all SNPs on chromosomes 1:21 when analysing by allele frequencies. The horizontal line indicates the cutoff for an FDR of 5%, with SNPs considered significant under the Bonferroni adjustment shown in green.

*Comparison of minor allele frequencies in both populations. SNPs with an FDR-adjusted p-value < 0.05 are coloured red, whilst those with a Bonferroni-adjusted p-value are additionally labelled.*

Comparison of minor allele frequencies in both populations. SNPs with an FDR-adjusted p-value < 0.05 are coloured red, whilst those with a Bonferroni-adjusted p-value are additionally labelled.

Analysis Using the FLK model

This was performed seperately and no SNPs of specific interest were detected.

Export of Data for Bayescan

A VCF was required with only the 1996 and 2012 populations, and restricted to the candidate SNPs after pruning for linkage disequilibrium and detection of the allele in the 1996 population.

gdsFile <- seqOpen(gdsPath, readonly = TRUE)
seqSetFilter(gdsFile, 
             variant.id = genotypes %>% 
                 distinct(variant.id, snpID) %>%
                 filter(snpID %in% filter(regionResults, p > 0.05)$snpID) %>% 
                 .[["variant.id"]],
             sample.id = sampleID %>% 
                 filter(Population %in% c(1996, 2012)) %>% 
                 .[["Sample"]])
seqGDS2VCF(gdsFile, "../5_stacks/vcf/filtered.vcf.gz")
seqResetFilter(gdsFile)
seqClose(gdsFile)

Session Information

R version 3.5.3 (2019-03-11)

**Platform:** x86_64-pc-linux-gnu (64-bit)

locale: LC_CTYPE=en_AU.UTF-8, LC_NUMERIC=C, LC_TIME=en_AU.UTF-8, LC_COLLATE=en_AU.UTF-8, LC_MONETARY=en_AU.UTF-8, LC_MESSAGES=en_AU.UTF-8, LC_PAPER=en_AU.UTF-8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_AU.UTF-8 and LC_IDENTIFICATION=C

attached base packages: stats4, parallel, grid, stats, graphics, grDevices, utils, datasets, methods and base

other attached packages: rtracklayer(v.1.42.2), plyranges(v.1.2.0), GenomicRanges(v.1.34.0), GenomeInfoDb(v.1.18.2), IRanges(v.2.16.0), S4Vectors(v.0.20.1), BiocGenerics(v.0.28.0), ggrepel(v.0.8.0), qqman(v.0.1.4), ggsn(v.0.5.0), rgdal(v.1.4-3), ggmap(v.3.0.0), sp(v.1.3-1), readxl(v.1.3.1), forcats(v.0.4.0), stringr(v.1.4.0), dplyr(v.0.8.0.1), purrr(v.0.3.2), readr(v.1.3.1), tidyr(v.0.8.3), tibble(v.2.1.1), ggplot2(v.3.1.0), tidyverse(v.1.2.1), magrittr(v.1.5), scales(v.1.0.0), pander(v.0.6.3), SNPRelate(v.1.16.0), SeqArray(v.1.22.6) and gdsfmt(v.1.18.1)

loaded via a namespace (and not attached): nlme(v.3.1-137), matrixStats(v.0.54.0), bitops(v.1.0-6), sf(v.0.7-3), lubridate(v.1.7.4), httr(v.1.4.0), tools(v.3.5.3), backports(v.1.1.3), R6(v.2.4.0), DBI(v.1.0.0), lazyeval(v.0.2.2), colorspace(v.1.4-1), withr(v.2.1.2), tidyselect(v.0.2.5), curl(v.3.3), compiler(v.3.5.3), cli(v.1.1.0), rvest(v.0.3.2), Biobase(v.2.42.0), xml2(v.1.2.0), DelayedArray(v.0.8.0), labeling(v.0.3), classInt(v.0.3-1), Rsamtools(v.1.34.1), digest(v.0.6.18), foreign(v.0.8-71), rmarkdown(v.1.12), XVector(v.0.22.0), jpeg(v.0.1-8), pkgconfig(v.2.0.2), htmltools(v.0.3.6), highr(v.0.8), rlang(v.0.3.2), rstudioapi(v.0.10), generics(v.0.0.2), jsonlite(v.1.6), BiocParallel(v.1.16.6), RCurl(v.1.95-4.12), GenomeInfoDbData(v.1.2.0), Matrix(v.1.2-17), Rcpp(v.1.0.1), munsell(v.0.5.0), stringi(v.1.4.3), yaml(v.2.2.0), SummarizedExperiment(v.1.12.0), zlibbioc(v.1.28.0), plyr(v.1.8.4), maptools(v.0.9-5), crayon(v.1.3.4), lattice(v.0.20-38), Biostrings(v.2.50.2), haven(v.2.1.0), hms(v.0.4.2), knitr(v.1.22), pillar(v.1.3.1), rjson(v.0.2.20), XML(v.3.98-1.19), glue(v.1.3.1), evaluate(v.0.13), calibrate(v.1.7.2), modelr(v.0.1.4), png(v.0.1-7), RgoogleMaps(v.1.4.3), cellranger(v.1.1.0), gtable(v.0.3.0), assertthat(v.0.2.1), xfun(v.0.5), broom(v.0.5.1), e1071(v.1.7-1), class(v.7.3-15), GenomicAlignments(v.1.18.1) and units(v.0.6-2)